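# Build the family classes from raw/equivalents.csv and save them as
# temp/family_class_first.pickle and temp/family_class_first.csv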
import pandas as pd
import pickle, os, time, itertools, csv

# Load the CSV (first line is the header), order the rows by the 'appnum'
# column and renumber the index 0..n-1 so it matches the sorted order.
df = (
	pd.read_csv("raw/equivalents.csv", header=0)
	.sort_values('appnum')
	.reset_index(drop=True)
)


# Build family_class: the set of distinct families, each a frozenset of the
# stripped column-2 values that start with "KR" or "US" and share the same
# column-0 value.
#
# Rows are grouped by consecutive runs of equal column-0 values, so df must
# already be ordered with equal keys adjacent (assumes column 0 is the
# application number the frame was sorted by -- TODO confirm).
#
# Whole runs are consumed at once instead of comparing each row with the next
# one: a look-ahead loop that stops when there is no next row never flushes
# the final run, which silently drops the last family.  An empty frame simply
# yields an empty family_class.
#
# Columns are read by position through .iloc; integer keys on a label-indexed
# row (row[0]) are deprecated in pandas.
family_class = set()
keys = df.iloc[:, 0]
docs = df.iloc[:, 2]

for _, run in itertools.groupby(zip(keys, docs), key=lambda pair: pair[0]):
	# Keep only the KR / US entries, with surrounding whitespace removed.
	family = frozenset(
		epodoc
		for epodoc in (doc.strip() for _, doc in run)
		if epodoc.startswith(("KR", "US"))
	)
	# A family left with no KR/US member is not recorded.
	if family:
		family_class.add(family)

# Assign family ids: map 1..N to the families in family_class.
# The families are ordered by their sorted member lists before numbering so
# that the ids are reproducible between runs; plain set iteration order
# depends on string hashing, which Python randomises per process by default.
class_id = {
	family_id: family
	for family_id, family in enumerate(sorted(family_class, key=sorted), start=1)
}

# Persist the id -> family mapping, first as a pickle and then as a flat CSV.
# Create the output directory up front so the results are not lost to a
# FileNotFoundError when "temp" does not exist yet.
os.makedirs("temp", exist_ok=True)

with open("temp/family_class_first.pickle", "wb") as f:
	pickle.dump(class_id, f)

with open("temp/family_class_first.csv", "w", encoding="utf-8", newline="") as f:
	wf = csv.writer(f)
	wf.writerow(["epodoc", "id"])
	# One row per (member, family id).  Members are written in sorted order
	# so the file content does not depend on frozenset iteration order.
	for family_id, family in class_id.items():
		wf.writerows([epodoc.strip(), family_id] for epodoc in sorted(family))

print("family_class is saved. Length of family_class = {}".format(len(family_class)))